Analysis date: 2023-01-02
CRC_Xenografts_FirstBatch_DataProcessing Script
load("../Data/Cache/Xenografts_Batch1_DataProcessing.RData")
Tune_And_Perform_PLSDA <- function(X, Y, folds, repeats, scale = TRUE, upperlimitfeatures = 200, initial_nr_of_components = 10, progressBar = TRUE){
plsda <- splsda( X = X,
Y= Y, scale = scale, ncomp = initial_nr_of_components)
message("Done calculating initial PLSDA")
message("Tuning components:")
perf.plsda <- perf(plsda, validation = "Mfold",
folds = folds, nrepeat = repeats, # use repeated cross-validation
progressBar = progressBar, auc = TRUE) # include AUC values
plot(perf.plsda, col = color.mixo(5:7),
sd = TRUE,
legend.position = "horizontal")
#perf.plsda$choice.ncomp
message("Done tuning components")
# grid of possible keepX values that will be tested for each component
list.keepX <- c(1:20, seq(20, upperlimitfeatures, 10))
message("Tune number of selected features:")
# undergo the tuning process to determine the optimal number of variables
tune.plsda <- tune.splsda(X = X, Y= Y, scale = scale,
ncomp = perf.plsda$choice.ncomp[2,1], validation = 'Mfold',
folds = folds, nrepeat = repeats, # use repeated cross-validation
dist = 'max.dist', # use max.dist measure
measure = "BER", # use balanced error rate of dist measure
test.keepX = list.keepX,
progressBar = progressBar,
cpus = 2)
p_tune_feat <- plot(tune.plsda, col = color.jet(perf.plsda$choice.ncomp[2,1]))
print(p_tune_feat)
message(paste("Optimal number of components:", tune.plsda$choice.ncomp$ncomp ) )
message("Optimal number of selected features per component:")
print(tune.plsda$choice.keepX)
optimal.ncomp <- tune.plsda$choice.ncomp$ncomp
optimal.keepX <- tune.plsda$choice.keepX[1:optimal.ncomp]
message("Calculate final sPLSDA model")
final.splsda <- splsda(X, Y,
ncomp = optimal.ncomp,
keepX = optimal.keepX, scale = scale)
# perf.final.splsda <- perf(final.splsda,
# folds = folds, nrepeat = repeats, # use #repeated cross-validation
# validation = "Mfold", dist = "max.dist", # #use max.dist measure
# progressBar = FALSE, auc = TRUE)
# perf.final.splsda$auc
return(final.splsda)
}
Run_GSEA_PLSDA <- function(PLSDA_result, component){
ranked_list <-
PLSDA_result$loadings$X %>%
as.data.frame() %>%
rownames_to_column("peptide") %>%
as_tibble() %>%
separate(peptide, into = c("HGNC_Symbol", "Annotated_Sequence")) %>%
select(HGNC_Symbol, FC = {{ component }} ) %>%
mutate(absFC = abs(FC) ) %>%
group_by(HGNC_Symbol) %>%
arrange(desc(absFC) ) %>%
slice(1) %>%
ungroup %>%
arrange(FC)
ranked_vector <- unlist(ranked_list[,2])
names(ranked_vector) <- mapIds(org.Hs.eg.db,
keys = ranked_list$HGNC_Symbol,
column = "ENTREZID", keytype = "SYMBOL")
ranked_vector <- ranked_vector[!is.na(names(ranked_vector)) ]
pathways <- reactomePathways(names(ranked_vector))
fgseaRes <- fgsea(pathways, ranked_vector, maxSize=500)
#print(head(fgseaRes))
topPathwaysUp <- fgseaRes[ES > 0 & padj < 0.1][head(order(pval), n=10), pathway]
topPathwaysDown <- fgseaRes[ES < 0 & padj < 0.1][head(order(pval), n=10), pathway]
topPathways <- c(topPathwaysUp, rev(topPathwaysDown))
message(paste( length(topPathways), "pathways below FDR of 10%"))
if(length(topPathways) > 0 ){
plot.new()
ggplot() + theme_void()
plotGseaTable(pathways[topPathways], ranked_vector, fgseaRes,
gseaParam=0.5)
}
}
plsda_pY <- Tune_And_Perform_PLSDA(
X = pY_mat_normtomedian,
Y = as.factor( str_split( gsub("log2FC_Xenograft_", "", rownames( pY_mat_normtomedian ) ),
pattern = "_", simplify = TRUE )[,1] ),
repeats = 50, folds = 3, initial_nr_of_components = 4,
scale = TRUE, progressBar = FALSE)
## Done calculating initial PLSDA
## Tuning components:
## Done tuning components
## Tune number of selected features:
## Optimal number of components: 3
## Optimal number of selected features per component:
## comp1 comp2 comp3
## 90 4 50
## Calculate final sPLSDA model
plotIndiv(plsda_pY, comp = c(1,2), # plot samples from final model
col.per.group = PGPalette[c(1,2,4,5)],
ellipse = TRUE, legend = TRUE, # include 95% confidence ellipse
title = ' (a) sPLS-DA on pY, comp 1 & 2')
plsda_pY$prop_expl_var
## $X
## comp1 comp2 comp3
## 0.36123832 0.07094834 0.20081173
##
## $Y
## comp1 comp2 comp3
## 0.3352754 0.3329572 0.2885741
plotLoadings(plsda_pY, method = 'mean', contrib = 'max', comp = 1, size.name = 0.5, size.title = 1, legend.color = PGPalette[c(1,2,4,5)])
plotLoadings(plsda_pY, method = 'mean', contrib = 'max', comp = 2, size.name = 0.5, size.title = 1, legend.color = PGPalette[c(1,2,4,5)])
#plotLoadings(plsda_pY, method = 'mean', contrib = 'max', comp = 3, size.name = 0.5, size.title = 1, legend.color = PGPalette[c(1,2,4,5)])
plotVar(plsda_pY, cex =2, comp = c(1,2), var.names = list(stringr::str_split(plsda_pY$names$colnames$X, pattern = "_", simplify = T )[,1] ) )
#plotVar(plsda_pY, cex =2, comp = c(2,3), var.names = list(stringr::str_split(plsda_pY$names$colnames$X, pattern = "_", simplify = T )[,1] ) )
#selectVar(plsda_pY, comp = 1)$name
Run_GSEA_PLSDA(PLSDA_result = plsda_pY, component = comp1)
## 'select()' returned 1:1 mapping between keys and columns
## Loading required namespace: reactome.db
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 0 pathways below FDR of 10%
Run_GSEA_PLSDA(PLSDA_result = plsda_pY, component = comp2)
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## Warning in fgseaMultilevel(...): For some of the pathways the P-values were
## likely overestimated. For such pathways log2err is set to NA.
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-50. You can set the `eps` argument to zero for better estimation.
## 10 pathways below FDR of 10%
Plot_StringDB(
rownames(plsda_pY$loadings$X[ (plsda_pY$loadings$X[,1] != 0 ),] ) %>%
str_split(pattern = "_", simplify = T) %>% .[,1])
Plot_StringDB(
rownames(plsda_pY$loadings$X[ (plsda_pY$loadings$X[,2] != 0 ),] ) %>%
str_split(pattern = "_", simplify = T) %>% .[,1])
plsda_pY_prep1 <- Tune_And_Perform_PLSDA(
X = pY_mat_normtomedian[paste0("log2FC_", names(prep_l[prep_l == "prep1"]) ),],
Y = as.factor( str_split( gsub("log2FC_Xenograft_", "", rownames( pY_mat_normtomedian[paste0("log2FC_", names(prep_l[prep_l == "prep1"]) ),] ) ),
pattern = "_", simplify = TRUE )[,1] ),
repeats = 50, folds = 3, initial_nr_of_components = 4,
scale = TRUE, progressBar = FALSE)
## Done calculating initial PLSDA
## Tuning components:
## Done tuning components
## Tune number of selected features:
## Optimal number of components: 2
## Optimal number of selected features per component:
## comp1 comp2
## 18 30
## Calculate final sPLSDA model
plotIndiv(plsda_pY_prep1, comp = c(1,2), # plot samples from final model
col.per.group = PGPalette[c(1,2,4,5)],
ellipse = TRUE, legend = TRUE, # include 95% confidence ellipse
title = ' (a) sPLS-DA on pY, comp 1 & 2')
plsda_pY_prep1$prop_expl_var
## $X
## comp1 comp2
## 0.35072718 0.09556697
##
## $Y
## comp1 comp2
## 0.3336348 0.3421679
plotLoadings(plsda_pY_prep1, method = 'mean', contrib = 'max', comp = 1, size.name = 0.5, size.title = 1, legend.color = PGPalette[c(1,2,4,5)])
plotLoadings(plsda_pY_prep1, method = 'mean', contrib = 'max', comp = 2, size.name = 0.5, size.title = 1, legend.color = PGPalette[c(1,2,4,5)])
#plotLoadings(plsda_pY_prep1, method = 'mean', contrib = 'max', comp = 3, size.name = 0.5, size.title = 1, legend.color = PGPalette[c(1,2,4,5)])
plotVar(plsda_pY_prep1, cex =2, comp = c(1,2), var.names = list(stringr::str_split(plsda_pY_prep1$names$colnames$X, pattern = "_", simplify = T )[,1] ) )
#plotVar(plsda_pY_prep1, cex =2, comp = c(2,3), var.names = list(stringr::str_split(plsda_pY_prep1$names$colnames$X, pattern = "_", simplify = T )[,1] ) )
#selectVar(plsda_pY_prep1, comp = 1)$name
Run_GSEA_PLSDA(PLSDA_result = plsda_pY_prep1, component = comp1)
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## Warning in fgseaMultilevel(...): For some of the pathways the P-values were
## likely overestimated. For such pathways log2err is set to NA.
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-50. You can set the `eps` argument to zero for better estimation.
## 2 pathways below FDR of 10%
Run_GSEA_PLSDA(PLSDA_result = plsda_pY_prep1, component = comp2)
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 0 pathways below FDR of 10%
Plot_StringDB(
rownames(plsda_pY_prep1$loadings$X[ (plsda_pY_prep1$loadings$X[,1] != 0 ),] ) %>%
str_split(pattern = "_", simplify = T) %>% .[,1])
Plot_StringDB(
rownames(plsda_pY_prep1$loadings$X[ (plsda_pY_prep1$loadings$X[,2] != 0 ),] ) %>%
str_split(pattern = "_", simplify = T) %>% .[,1])
plsda_pY_prep2 <- Tune_And_Perform_PLSDA(
X = pY_mat_normtomedian[paste0("log2FC_", names(prep_l[prep_l == "prep2"]) ),],
Y = as.factor( str_split( gsub("log2FC_Xenograft_", "", rownames( pY_mat_normtomedian[paste0("log2FC_", names(prep_l[prep_l == "prep2"]) ),] ) ),
pattern = "_", simplify = TRUE )[,1] ),
repeats = 50, folds = 3, initial_nr_of_components = 4,
scale = TRUE, progressBar = FALSE)
## Done calculating initial PLSDA
## Tuning components:
## Done tuning components
## Tune number of selected features:
## Optimal number of components: 4
## Optimal number of selected features per component:
## comp1 comp2 comp3 comp4
## 110 11 8 3
## Calculate final sPLSDA model
plotIndiv(plsda_pY_prep2, comp = c(1,2), # plot samples from final model
col.per.group = PGPalette[c(1,2,4,5)],
ellipse = TRUE, legend = TRUE, # include 95% confidence ellipse
title = ' (a) sPLS-DA on pY, comp 1 & 2')
plsda_pY_prep2$prop_expl_var
## $X
## comp1 comp2 comp3 comp4
## 0.35127891 0.11397880 0.04623242 0.04673407
##
## $Y
## comp1 comp2 comp3 comp4
## 0.3300180 0.2795119 0.3366914 0.2901813
plotLoadings(plsda_pY_prep2, method = 'mean', contrib = 'max', comp = 1, size.name = 0.5, size.title = 1, legend.color = PGPalette[c(1,2,4,5)])
plotLoadings(plsda_pY_prep2, method = 'mean', contrib = 'max', comp = 2, size.name = 0.5, size.title = 1, legend.color = PGPalette[c(1,2,4,5)])
#plotLoadings(plsda_pY_prep2, method = 'mean', contrib = 'max', comp = 3, size.name = 0.5, size.title = 1, legend.color = PGPalette[c(1,2,4,5)])
plotVar(plsda_pY_prep2, cex =2, comp = c(1,2), var.names = list(stringr::str_split(plsda_pY_prep2$names$colnames$X, pattern = "_", simplify = T )[,1] ) )
#plotVar(plsda_pY_prep2, cex =2, comp = c(2,3), var.names = list(stringr::str_split(plsda_pY_prep2$names$colnames$X, pattern = "_", simplify = T )[,1] ) )
#selectVar(plsda_pY_prep2, comp = 1)$name
Run_GSEA_PLSDA(PLSDA_result = plsda_pY_prep2, component = comp1)
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 0 pathways below FDR of 10%
Run_GSEA_PLSDA(PLSDA_result = plsda_pY_prep2, component = comp2)
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 0 pathways below FDR of 10%
Plot_StringDB(
rownames(plsda_pY_prep2$loadings$X[ (plsda_pY_prep2$loadings$X[,1] != 0 ),] ) %>%
str_split(pattern = "_", simplify = T) %>% .[,1])
Plot_StringDB(
rownames(plsda_pY_prep2$loadings$X[ (plsda_pY_prep2$loadings$X[,2] != 0 ),] ) %>%
str_split(pattern = "_", simplify = T) %>% .[,1])
sessionInfo()
## R version 4.1.3 (2022-03-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] forcats_0.5.2 stringr_1.4.1
## [3] dplyr_1.0.10 purrr_0.3.5
## [5] readr_2.1.3 tidyr_1.2.1
## [7] tibble_3.1.8 tidyverse_1.3.2
## [9] mixOmics_6.18.1 ggplot2_3.3.6
## [11] lattice_0.20-45 MASS_7.3-58.1
## [13] mdatools_0.13.0 SummarizedExperiment_1.24.0
## [15] GenomicRanges_1.46.1 GenomeInfoDb_1.30.1
## [17] MatrixGenerics_1.6.0 matrixStats_0.62.0
## [19] DEP_1.16.0 org.Hs.eg.db_3.14.0
## [21] AnnotationDbi_1.56.2 IRanges_2.28.0
## [23] S4Vectors_0.32.4 Biobase_2.54.0
## [25] BiocGenerics_0.40.0 fgsea_1.20.0
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.2 shinydashboard_0.7.2 proto_1.0.0
## [4] gmm_1.7 tidyselect_1.2.0 RSQLite_2.2.18
## [7] htmlwidgets_1.5.4 grid_4.1.3 BiocParallel_1.28.3
## [10] norm_1.0-10.0 munsell_0.5.0 codetools_0.2-18
## [13] preprocessCore_1.56.0 chron_2.3-58 DT_0.26
## [16] withr_2.5.0 colorspace_2.0-3 highr_0.9
## [19] knitr_1.40 rstudioapi_0.14 mzID_1.32.0
## [22] labeling_0.4.2 GenomeInfoDbData_1.2.7 bit64_4.0.5
## [25] farver_2.1.1 vctrs_0.5.0 generics_0.1.3
## [28] xfun_0.34 R6_2.5.1 doParallel_1.0.17
## [31] clue_0.3-62 MsCoreUtils_1.6.2 bitops_1.0-7
## [34] cachem_1.0.6 DelayedArray_0.20.0 assertthat_0.2.1
## [37] promises_1.2.0.1 scales_1.2.1 googlesheets4_1.0.1
## [40] gtable_0.3.1 affy_1.72.0 sandwich_3.0-2
## [43] rlang_1.0.6 mzR_2.28.0 GlobalOptions_0.1.2
## [46] gargle_1.2.1 impute_1.68.0 broom_1.0.1
## [49] BiocManager_1.30.19 yaml_2.3.6 reshape2_1.4.4
## [52] modelr_0.1.9 backports_1.4.1 httpuv_1.6.6
## [55] tools_4.1.3 affyio_1.64.0 ellipsis_0.3.2
## [58] gplots_3.1.3 jquerylib_0.1.4 RColorBrewer_1.1-3
## [61] STRINGdb_2.6.5 MSnbase_2.20.4 gsubfn_0.7
## [64] Rcpp_1.0.9 hash_2.2.6.2 plyr_1.8.7
## [67] zlibbioc_1.40.0 RCurl_1.98-1.9 sqldf_0.4-11
## [70] GetoptLong_1.0.5 zoo_1.8-11 haven_2.5.1
## [73] ggrepel_0.9.1 cluster_2.1.4 fs_1.5.2
## [76] magrittr_2.0.3 data.table_1.14.4 RSpectra_0.16-1
## [79] circlize_0.4.15 reactome.db_1.77.0 reprex_2.0.2
## [82] googledrive_2.0.0 pcaMethods_1.86.0 mvtnorm_1.1-3
## [85] ProtGenerics_1.26.0 hms_1.1.2 mime_0.12
## [88] evaluate_0.17 xtable_1.8-4 XML_3.99-0.12
## [91] readxl_1.4.1 gridExtra_2.3 shape_1.4.6
## [94] compiler_4.1.3 ellipse_0.4.3 KernSmooth_2.23-20
## [97] ncdf4_1.19 crayon_1.5.2 htmltools_0.5.3
## [100] corpcor_1.6.10 later_1.3.0 tzdb_0.3.0
## [103] lubridate_1.8.0 DBI_1.1.3 dbplyr_2.2.1
## [106] ComplexHeatmap_2.10.0 tmvtnorm_1.5 Matrix_1.5-1
## [109] cli_3.4.1 vsn_3.62.0 imputeLCMD_2.1
## [112] parallel_4.1.3 igraph_1.3.5 pkgconfig_2.0.3
## [115] MALDIquant_1.21 xml2_1.3.3 foreach_1.5.2
## [118] rARPACK_0.11-0 bslib_0.4.0 XVector_0.34.0
## [121] rvest_1.0.3 digest_0.6.30 Biostrings_2.62.0
## [124] rmarkdown_2.17 cellranger_1.1.0 fastmatch_1.1-3
## [127] shiny_1.7.3 gtools_3.9.3 rjson_0.2.21
## [130] lifecycle_1.0.3 jsonlite_1.8.3 limma_3.50.3
## [133] fansi_1.0.3 pillar_1.8.1 KEGGREST_1.34.0
## [136] fastmap_1.1.0 httr_1.4.4 plotrix_3.8-2
## [139] glue_1.6.2 png_0.1-7 iterators_1.0.14
## [142] bit_4.0.4 stringi_1.7.8 sass_0.4.2
## [145] blob_1.2.3 caTools_1.18.2 memoise_2.0.1
knitr::knit_exit()